{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "os.environ['CUDA_VISIBLE_DEVICES'] = '0'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "with open('/home/husein/alxlnet/topics.json') as fopen:\n",
    "    topics = set(json.load(fopen).keys())\n",
    "    \n",
    "list_topics = list(topics)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:From /home/husein/xlnet/model_utils.py:295: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.\n",
      "\n"
     ]
    }
   ],
   "source": [
    "import xlnet\n",
    "import numpy as np\n",
    "import tensorflow as tf\n",
    "from tqdm import tqdm\n",
    "import model_utils\n",
    "import random"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import sentencepiece as spm\n",
    "from prepro_utils import preprocess_text, encode_ids\n",
    "\n",
    "sp_model = spm.SentencePieceProcessor()\n",
    "sp_model.Load('sp10m.cased.v9.model')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "from malaya.text.function import transformer_textcleaning as cleaning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "from prepro_utils import preprocess_text, encode_ids\n",
    "\n",
    "def tokenize_fn(text):\n",
    "    text = preprocess_text(text, lower= False)\n",
    "    return encode_ids(sp_model, text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "SEG_ID_A   = 0\n",
    "SEG_ID_B   = 1\n",
    "SEG_ID_CLS = 2\n",
    "SEG_ID_SEP = 3\n",
    "SEG_ID_PAD = 4\n",
    "\n",
    "special_symbols = {\n",
    "    \"<unk>\"  : 0,\n",
    "    \"<s>\"    : 1,\n",
    "    \"</s>\"   : 2,\n",
    "    \"<cls>\"  : 3,\n",
    "    \"<sep>\"  : 4,\n",
    "    \"<pad>\"  : 5,\n",
    "    \"<mask>\" : 6,\n",
    "    \"<eod>\"  : 7,\n",
    "    \"<eop>\"  : 8,\n",
    "}\n",
    "\n",
    "VOCAB_SIZE = 32000\n",
    "UNK_ID = special_symbols[\"<unk>\"]\n",
    "CLS_ID = special_symbols[\"<cls>\"]\n",
    "SEP_ID = special_symbols[\"<sep>\"]\n",
    "MASK_ID = special_symbols[\"<mask>\"]\n",
    "EOD_ID = special_symbols[\"<eod>\"]\n",
    "\n",
    "def F(left_train):\n",
    "    tokens_a = tokenize_fn(left_train)\n",
    "    segment_id = [SEG_ID_A] * len(tokens_a)\n",
    "    tokens_a.append(SEP_ID)\n",
    "    tokens_a.append(CLS_ID)\n",
    "    segment_id.append(SEG_ID_A)\n",
    "    segment_id.append(SEG_ID_CLS)\n",
    "    input_mask = [0] * len(tokens_a)\n",
    "    return tokens_a, segment_id, input_mask\n",
    "\n",
    "def XY(data):\n",
    "    \n",
    "    if len(set(data[1]) & topics) and random.random() > 0.2:\n",
    "        t = random.choice(data[1])\n",
    "        label = 1\n",
    "    else:\n",
    "        s = (set(data[1]) | set())\n",
    "        t = random.choice(list(topics - s))\n",
    "        label = 0\n",
    "    X = F(cleaning(data[0]))\n",
    "    Y = F(t)\n",
    "    \n",
    "    return X, Y, label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('/home/husein/alxlnet/testset-keyphrase.json') as fopen:\n",
    "    data = json.load(fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tensorflow.keras.preprocessing.sequence import pad_sequences"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:From /home/husein/xlnet/xlnet.py:63: The name tf.gfile.Open is deprecated. Please use tf.io.gfile.GFile instead.\n",
      "\n"
     ]
    }
   ],
   "source": [
    "class Parameter:\n",
    "    def __init__(\n",
    "        self,\n",
    "        decay_method,\n",
    "        warmup_steps,\n",
    "        weight_decay,\n",
    "        adam_epsilon,\n",
    "        num_core_per_host,\n",
    "        lr_layer_decay_rate,\n",
    "        use_tpu,\n",
    "        learning_rate,\n",
    "        train_steps,\n",
    "        min_lr_ratio,\n",
    "        clip,\n",
    "        **kwargs\n",
    "    ):\n",
    "        self.decay_method = decay_method\n",
    "        self.warmup_steps = warmup_steps\n",
    "        self.weight_decay = weight_decay\n",
    "        self.adam_epsilon = adam_epsilon\n",
    "        self.num_core_per_host = num_core_per_host\n",
    "        self.lr_layer_decay_rate = lr_layer_decay_rate\n",
    "        self.use_tpu = use_tpu\n",
    "        self.learning_rate = learning_rate\n",
    "        self.train_steps = train_steps\n",
    "        self.min_lr_ratio = min_lr_ratio\n",
    "        self.clip = clip\n",
    "\n",
    "num_train_steps = 300000\n",
    "warmup_proportion = 0.1\n",
    "num_warmup_steps = int(num_train_steps * warmup_proportion)\n",
    "initial_learning_rate = 2e-5\n",
    "\n",
    "kwargs = dict(\n",
    "    is_training = True,\n",
    "    use_tpu = False,\n",
    "    use_bfloat16 = False,\n",
    "    dropout = 0.1,\n",
    "    dropatt = 0.1,\n",
    "    init = 'normal',\n",
    "    init_range = 0.1,\n",
    "    init_std = 0.05,\n",
    "    clamp_len = -1,\n",
    ")\n",
    "\n",
    "xlnet_parameters = xlnet.RunConfig(**kwargs)\n",
    "xlnet_config = xlnet.XLNetConfig(\n",
    "    json_path = 'xlnet-base-29-03-2020/config.json'\n",
    ")\n",
    "training_parameters = dict(\n",
    "    decay_method = 'poly',\n",
    "    train_steps = num_train_steps,\n",
    "    learning_rate = initial_learning_rate,\n",
    "    warmup_steps = num_warmup_steps,\n",
    "    min_lr_ratio = 0.0,\n",
    "    weight_decay = 0.00,\n",
    "    adam_epsilon = 1e-8,\n",
    "    num_core_per_host = 1,\n",
    "    lr_layer_decay_rate = 1,\n",
    "    use_tpu = False,\n",
    "    use_bfloat16 = False,\n",
    "    dropout = 0.1,\n",
    "    dropatt = 0.1,\n",
    "    init = 'normal',\n",
    "    init_range = 0.1,\n",
    "    init_std = 0.05,\n",
    "    clip = 1.0,\n",
    "    clamp_len = -1,\n",
    ")\n",
    "training_parameters = Parameter(**training_parameters)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "class Model:\n",
    "    def __init__(\n",
    "        self,\n",
    "        dimension_output = 2,\n",
    "    ):\n",
    "        self.X = tf.placeholder(tf.int32, [None, None])\n",
    "        self.segment_ids = tf.placeholder(tf.int32, [None, None])\n",
    "        self.input_masks = tf.placeholder(tf.float32, [None, None])\n",
    "        \n",
    "        self.X_b = tf.placeholder(tf.int32, [None, None])\n",
    "        self.segment_ids_b = tf.placeholder(tf.int32, [None, None])\n",
    "        self.input_masks_b = tf.placeholder(tf.float32, [None, None])\n",
    "        \n",
    "        self.Y = tf.placeholder(tf.int32, [None])\n",
    "        \n",
    "        with tf.compat.v1.variable_scope('xlnet', reuse = False):\n",
    "            xlnet_model = xlnet.XLNetModel(\n",
    "                xlnet_config=xlnet_config,\n",
    "                run_config=xlnet_parameters,\n",
    "                input_ids=tf.transpose(self.X, [1, 0]),\n",
    "                seg_ids=tf.transpose(self.segment_ids, [1, 0]),\n",
    "                input_mask=tf.transpose(self.input_masks, [1, 0]))\n",
    "\n",
    "            summary = xlnet_model.get_pooled_out(\"last\", True)\n",
    "            summary = tf.identity(summary, name = 'summary')\n",
    "            self.summary = summary\n",
    "            print(summary)\n",
    "            \n",
    "        with tf.compat.v1.variable_scope('xlnet', reuse = True):\n",
    "            xlnet_model = xlnet.XLNetModel(\n",
    "                xlnet_config=xlnet_config,\n",
    "                run_config=xlnet_parameters,\n",
    "                input_ids=tf.transpose(self.X_b, [1, 0]),\n",
    "                seg_ids=tf.transpose(self.segment_ids_b, [1, 0]),\n",
    "                input_mask=tf.transpose(self.input_masks_b, [1, 0]))\n",
    "            summary_b = xlnet_model.get_pooled_out(\"last\", True)\n",
    "        \n",
    "        vectors_concat = [summary, summary_b, tf.abs(summary - summary_b)]\n",
    "        vectors_concat = tf.concat(vectors_concat, axis = 1)\n",
    "        \n",
    "        self.logits = tf.layers.dense(vectors_concat, dimension_output)\n",
    "        self.logits = tf.identity(self.logits, name = 'logits')\n",
    "        \n",
    "        self.cost = tf.reduce_mean(\n",
    "            tf.nn.sparse_softmax_cross_entropy_with_logits(\n",
    "                logits = self.logits, labels = self.Y\n",
    "            )\n",
    "        )\n",
    "        \n",
    "        correct_pred = tf.equal(\n",
    "            tf.argmax(self.logits, 1, output_type = tf.int32), self.Y\n",
    "        )\n",
    "        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:From /home/husein/xlnet/xlnet.py:220: The name tf.variable_scope is deprecated. Please use tf.compat.v1.variable_scope instead.\n",
      "\n",
      "WARNING:tensorflow:From /home/husein/xlnet/xlnet.py:220: The name tf.AUTO_REUSE is deprecated. Please use tf.compat.v1.AUTO_REUSE instead.\n",
      "\n",
      "WARNING:tensorflow:From /home/husein/xlnet/modeling.py:453: The name tf.logging.info is deprecated. Please use tf.compat.v1.logging.info instead.\n",
      "\n",
      "INFO:tensorflow:memory input None\n",
      "INFO:tensorflow:Use float type <dtype: 'float32'>\n",
      "WARNING:tensorflow:From /home/husein/xlnet/modeling.py:460: The name tf.get_variable is deprecated. Please use tf.compat.v1.get_variable instead.\n",
      "\n",
      "WARNING:tensorflow:From /home/husein/xlnet/modeling.py:535: dropout (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "Use keras.layers.dropout instead.\n",
      "WARNING:tensorflow:From /home/husein/.local/lib/python3.6/site-packages/tensorflow_core/python/layers/core.py:271: Layer.apply (from tensorflow.python.keras.engine.base_layer) is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "Please use `layer.__call__` method instead.\n",
      "WARNING:tensorflow:\n",
      "The TensorFlow contrib module will not be included in TensorFlow 2.0.\n",
      "For more information, please see:\n",
      "  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md\n",
      "  * https://github.com/tensorflow/addons\n",
      "  * https://github.com/tensorflow/io (for I/O related ops)\n",
      "If you depend on functionality not listed there, please file an issue.\n",
      "\n",
      "WARNING:tensorflow:From /home/husein/xlnet/modeling.py:67: dense (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "Use keras.layers.Dense instead.\n",
      "Tensor(\"xlnet/summary:0\", shape=(?, 768), dtype=float32)\n",
      "INFO:tensorflow:memory input None\n",
      "INFO:tensorflow:Use float type <dtype: 'float32'>\n"
     ]
    }
   ],
   "source": [
    "dimension_output = 2\n",
    "\n",
    "tf.reset_default_graph()\n",
    "sess = tf.InteractiveSession()\n",
    "model = Model(\n",
    "    dimension_output,\n",
    ")\n",
    "\n",
    "sess.run(tf.global_variables_initializer())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "tvars = tf.trainable_variables()\n",
    "checkpoint = 'xlnet-base-keyphrase/model.ckpt-240000'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Restoring parameters from xlnet-base-keyphrase/model.ckpt-240000\n"
     ]
    }
   ],
   "source": [
    "saver = tf.train.Saver(var_list = tf.trainable_variables())\n",
    "saver.restore(sess, checkpoint)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 625/625 [01:30<00:00,  6.94it/s]\n"
     ]
    }
   ],
   "source": [
    "from tqdm import tqdm\n",
    "\n",
    "batch_size = 32\n",
    "real_Y,predict_Y = [], []\n",
    "\n",
    "for i in tqdm(range(0, len(data), batch_size)):\n",
    "    batch = data[i: i + batch_size]\n",
    "    X, segment, mask, X_b, segment_b, mask_b = [], [], [], [], [], []\n",
    "    for k in range(len(batch)):\n",
    "        x = F(batch[k][0])\n",
    "        y = F(batch[k][1])\n",
    "        X.append(x[0])\n",
    "        segment.append(x[1])\n",
    "        mask.append(x[2])\n",
    "        X_b.append(y[0])\n",
    "        segment_b.append(y[1])\n",
    "        mask_b.append(y[2])\n",
    "        \n",
    "    X = pad_sequences(X, padding = 'post')\n",
    "    segment = pad_sequences(segment, padding = 'post', value = 1)\n",
    "    mask = pad_sequences(mask, padding = 'post', value = 4)\n",
    "    X_b = pad_sequences(X_b, padding = 'post')\n",
    "    segment_b = pad_sequences(segment_b, padding = 'post', value = 1)\n",
    "    mask_b = pad_sequences(mask_b, padding = 'post', value = 4)\n",
    "    \n",
    "    batch_y = [b[2] for b in batch]\n",
    "    \n",
    "    predict_Y += np.argmax(sess.run(model.logits,\n",
    "            feed_dict = {\n",
    "                model.X: X,\n",
    "                model.segment_ids: segment,\n",
    "                model.input_masks: mask,\n",
    "                model.X_b: X_b,\n",
    "                model.segment_ids_b: segment_b,\n",
    "                model.input_masks_b: mask_b,\n",
    "            },\n",
    "    ), 1, ).tolist()\n",
    "    \n",
    "    real_Y += batch_y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      " not similar    0.99947   0.99802   0.99874     15133\n",
      "     similar    0.99386   0.99836   0.99610      4867\n",
      "\n",
      "    accuracy                        0.99810     20000\n",
      "   macro avg    0.99667   0.99819   0.99742     20000\n",
      "weighted avg    0.99811   0.99810   0.99810     20000\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from sklearn import metrics\n",
    "\n",
    "print(\n",
    "    metrics.classification_report(\n",
    "        real_Y, predict_Y, target_names = ['not similar', 'similar'],\n",
    "        digits = 5\n",
    "    )\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
